# This script loads the clean datasets and prepares the output tables

#Please open the R-Project "Replication_STC.Rproj" first and load the "README.Rmd" to set the working directory.
getwd()

rm(list=ls())

library(tidyverse)
'%!in%' <- function(x,y)!('%in%'(x,y))

# ... HS ####

# load correspondence tables ####

# correspondence tables for services
cor_services <- list(readxl::read_xlsx("ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-class", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-group", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-division", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-class", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-group", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-division", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-class", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-group", na = "NA"),
                     readxl::read_xlsx("ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-division", na = "NA"))
names(cor_services) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# correspondence tables for goods
load("ancillary_data/correspondence_ISIC_to_HS.rds")

cor_goods_hs6 <- corr
rm(corr)

cor <- list()
for (i in 1:length(cor_goods_hs6)) {
  cor[[i]] <- cor_goods_hs6[[i]] %>%
    filter(ISIC_code <= c(4100, 410, 41, 4100, 410, 41, 3900, 390, 39)[i]) %>% # exclude services sectors
    bind_rows(., cor_services[[i]] %>%
                mutate(ISIC_code = ISIC_code) %>%
                filter(ISIC_code > c(4100, 410, 41, 4100, 410, 41, 3900, 390, 39)[i]) # exclude non-services sectors
              ) %>% 
    mutate(cor_code = ifelse(is.na(hs17), as.character(EBOPS_code), as.character(hs17))) %>%
    dplyr::select("ISIC_code", "cor_code")
}
names(cor) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# cor <- cor[c("ISIC3g", "ISIC31g", "ISIC4g")] # since we only use group level data, we do not need the other correspondence tables

hs_codes_I3c <- unique(sort(cor[["ISIC3g"]]$cor_code))
hs_codes_I4c <- unique(sort(cor[["ISIC4g"]]$cor_code))

'%!in%' <- function(x,y)!('%in%'(x,y))

hs_codes_I4c[which(hs_codes_I4c %!in% hs_codes_I3c)]
#Note: ISIC 3c is more conservative, most differences are primary. Also for consistency reasons

# load Trade data ####

# load EBOPS for services
load("not_shareable/serv_clean.rds") 
#this data source cannot be shared due to legal reasons. it is obtainable through https://www.wto.org/english/res_e/statis_e/trade_datasets_e.htm
#it is necessary to download XXX

trade_services <- serv_grid %>%
  filter(EBOPS_2002 %in% c("S205", "S236", "S245", "S249", "S253", "S260", "S262", "S268", "S287", "S291")) %>%
  mutate(value = value * 1e+6, 
         cor_code = as.character(EBOPS_2002)) %>%
  dplyr::select(c("country_short", "year", "flow", "cor_code", "value")) %>%
  filter(!is.na(country_short))

rm(serv_grid)

# load HS6 for goods
load("not_shareable/hs6_clean.rds")
#this data source cannot be shared but the data are obtainable through https://comtrade.un.org/data
#it is necessary to download all trade flows for all countries in hs6 since 1999

# combine goods and services data
trade_data <- hs_grid %>%
  ungroup() %>%
  mutate(cor_code = as.character(HS17)) %>%
  filter(cor_code %in% hs_codes_I3c) %>%
  bind_rows(., trade_services) %>%
  dplyr::select(c(country_short, year, flow, cor_code, value))

rm(hs_grid, hs_codes_I3c, hs_codes_I4c)

#### Calculate World and Country Exports and Imports ####

# World Total Exports
world_total_ex <- trade_data %>%
  filter(flow == "Export") %>%
  group_by(year) %>%
  summarise(world_total_ex_hs = sum(value, na.rm=T))

# Country Total Exports
cntry_total_ex <- trade_data %>%
  filter(flow == "Export") %>%
  group_by(country_short, year) %>%
  summarise(cntry_total_ex_hs = sum(value, na.rm=T))

# World Total Import
world_total_im <- trade_data %>%
  filter(flow == "Import") %>%
  group_by(year) %>%
  summarise(world_total_im_hs = sum(value, na.rm=T))

# Country Total Import
cntry_total_im <- trade_data %>%
  filter(flow == "Import") %>%
  group_by(country_short, year) %>%
  summarise(cntry_total_im_hs = sum(value, na.rm=T))

# World ISIC Exports
world_ISIC_ex <- list()
for (i in 1:length(cor)) {
  world_ISIC_ex[[i]] <- dplyr::left_join(cor[[i]], trade_data, by = "cor_code") %>%
    filter(flow == "Export") %>%
    group_by(year, ISIC_code) %>%
    summarize(world_ISIC_ex_hs = sum(value, na.rm = T))
}
names(world_ISIC_ex) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# World ISIC Imports
world_ISIC_im <- list()
for (i in 1:length(cor)) {
  world_ISIC_im[[i]] <- dplyr::left_join(cor[[i]], trade_data, by = "cor_code") %>%
    filter(flow == "Import") %>%
    group_by(year, ISIC_code) %>%
    summarize(world_ISIC_im_hs = sum(value, na.rm = T))
}
names(world_ISIC_im) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# Country ISIC Exports
cntry_ISIC_ex <- list()
for (i in 1:length(cor)) {
  cntry_ISIC_ex[[i]] <- dplyr::left_join(cor[[i]], trade_data, by = "cor_code") %>%
    filter(flow == "Export") %>%
    group_by(country_short, year, ISIC_code) %>%
    summarize(cntry_ISIC_ex_hs = sum(value, na.rm = T))
  
  print(paste0("Exports: ",i,"/",length(cor)))
}
names(cntry_ISIC_ex) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# Country ISIC Imports
cntry_ISIC_im <- list()
for (i in 1:length(cor)) {
  cntry_ISIC_im[[i]] <- dplyr::left_join(cor[[i]], trade_data, "cor_code") %>%
    filter(flow == "Import") %>%
    group_by(country_short, year, ISIC_code) %>%
    summarize(cntry_ISIC_im_hs = sum(value, na.rm = T))
  
  print(paste0("Imports: ",i,"/",length(cor)))
}

names(cntry_ISIC_im) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

#### Calculate RCA ####

# via ISIC categories
rca <- list()
for (i in 1:length(cntry_ISIC_ex)) {
  rca[[i]] <- cntry_ISIC_ex[[i]] %>%
    dplyr::left_join(., cntry_ISIC_im[[i]], by = c("country_short", "year", "ISIC_code")) %>%
    dplyr::left_join(., cntry_total_ex, by = c("country_short", "year")) %>%
    dplyr::left_join(., cntry_total_im, by = c("country_short", "year")) %>%
    dplyr::left_join(., world_ISIC_ex[[i]], by = c("ISIC_code", "year")) %>%
    dplyr::left_join(., world_ISIC_im[[i]], by = c("ISIC_code", "year")) %>%
    dplyr::left_join(., world_total_ex, by = c("year")) %>%
    dplyr::left_join(., world_total_im, by = c("year")) %>%
    filter(cntry_ISIC_ex_hs >= 0) %>%
    mutate(ex_share_country = cntry_ISIC_ex_hs / (cntry_total_ex_hs - cntry_ISIC_ex_hs),
           ex_share_world = world_ISIC_ex_hs / (world_total_ex_hs - world_ISIC_ex_hs),
           im_share_country = cntry_ISIC_im_hs / (cntry_total_im_hs - cntry_ISIC_im_hs),
           im_share_world = world_ISIC_im_hs / (world_total_im_hs - world_ISIC_im_hs),
           rxa_raw = ex_share_country/ex_share_world,
           rma_raw = im_share_country/im_share_world,
           rca_sym = (rxa_raw - 1) / (rxa_raw + 1),
           rca_add = (cntry_ISIC_ex_hs / cntry_total_ex_hs) - (world_ISIC_ex_hs / world_total_ex_hs),
           rca_net = (((rxa_raw -1 ) / (rxa_raw + 1)) - ((rma_raw - 1) / (rma_raw + 1))) / 2,
           rca_tba = (cntry_ISIC_ex_hs - cntry_ISIC_im_hs) / (cntry_ISIC_ex_hs + cntry_ISIC_im_hs))
  print(paste0("RCA: ",i,"/",length(cntry_ISIC_ex)))
}
names(rca) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

cor(rca$ISIC3c[c("rca_sym", "rca_add", "rca_net", "rca_tba")], use = "complete.obs")

rm(list = setdiff(ls(), c(ls(pattern = "rca"))))

saveRDS(rca, file = "rca.rds")

openxlsx::write.xlsx(rca, "rca.xlsx")

#### ... Labour Surveys ####

# Sector limits
sectors <- readxl::read_excel("ancillary_data/industry_groups.xlsx", na = "NA", sheet = "manu") # Sheet 'manu' differentiates between high- and low-tech manufacturing. Sheet 'totl' does not.
ISIC_schemes <- list(ISIC3_names = readxl::read_excel("ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC3_names"),
                     ISIC31_names = readxl::read_excel("ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC31_names"),
                     ISIC4_names = readxl::read_excel("ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC4_names"))
inputFiles <- paste0("./not_shareable/surveys/", list.files(pattern = "*.rds", path = "./not_shareable/surveys/")) 
inputFiles <- inputFiles[-which(inputFiles == "./not_shareable/surveys/keys.rds")]

stc <- data_frame()

stc_raw <- data_frame()

for (i in 1:length(inputFiles)) {

    #### Load Labor and Household Surveys ####
  
  dat_raw <- readRDS(inputFiles[i])
  tempCountry <- unique(dat_raw$country_short)
  tempLevel <- unique(dat_raw$subnational_level)
  
  print(paste0("Starting: ",i,"/",length(inputFiles), "; ", tempCountry, " ", tempLevel))
  
  # Transform class level information into group level information
  if ("ISIC3_class" %in% names(dat_raw)) {dat_raw$ISIC3_group = substr(dat_raw$ISIC3_class, 1, nchar(dat_raw$ISIC3_class)-1)}
  if ("ISIC31_class" %in% names(dat_raw)) {dat_raw$ISIC31_group = substr(dat_raw$ISIC31_class, 1, nchar(dat_raw$ISIC31_class)-1)}
  if ("ISIC4_class" %in% names(dat_raw)) {dat_raw$ISIC4_group = substr(dat_raw$ISIC4_class, 1, nchar(dat_raw$ISIC4_class)-1)}
  
  dat <- dat_raw %>%
    dplyr::select(-contains("class"), -contains("division")) %>%
    gather(-c("country_short", "subnational_level", "subnational_code", "subnational_name", "year", "count", "weight"), key = "ISIC_coding", value = "ISIC_code") %>%
    mutate(ISIC_code = as.numeric(as.character(ISIC_code))) %>%
    #drop_na() %>%
    group_by(country_short, subnational_level, subnational_code, subnational_name, year, ISIC_coding, ISIC_code) %>%
    summarise(count = sum(count, na.rm=T),
              weight = sum(weight, na.rm=T)) %>%
    ungroup()
  
  dat_grid <- expand.grid(subnational_code = unique(dat$subnational_code),
                          year = 1999:2019,
                          ISIC = unique(paste0(dat$ISIC_coding, "-", dat$ISIC_code))) %>%
    dplyr::left_join(., dat %>% dplyr::select(country_short, subnational_level, subnational_code, subnational_name) %>% distinct(), by = "subnational_code")
  
  dat_fill <- dplyr::left_join(dat_grid, dat %>%
                                 mutate(ISIC = paste0(ISIC_coding, "-", ISIC_code)),# %>%
                                 #drop_na(),
                               by = c("country_short", "subnational_level", "subnational_code", "subnational_name", "year", "ISIC")) %>%
    arrange(subnational_code, year, ISIC) %>%
    group_by(subnational_code, ISIC) %>%
    mutate(count1 = zoo::na.approx(count, na.rm = F),
           weight1 = zoo::na.approx(weight, na.rm = F),
           count2 = zoo::na.locf(count, na.rm = F),
           count2 = ifelse(is.na(count2), zoo::na.locf(count, na.rm = F, fromLast = T), count2),
           weight2 = zoo::na.locf(weight, na.rm = F),
           weight2 = ifelse(is.na(weight2), zoo::na.locf(weight, na.rm = F, fromLast = T), weight2),
           estimated = ifelse(is.na(count2), NA, 
                              ifelse(is.na(count1), 2,
                                     ifelse(is.na(count), 1, 0))),
           count = ifelse(!is.na(count), count,
                          ifelse(!is.na(count1), count1, count2)),
           weight = ifelse(!is.na(weight), weight,
                           ifelse(!is.na(weight1), weight1, weight2)),
           ISIC_coding = gsub("\\-.*", "", ISIC),
           ISIC_code = as.numeric(as.character(gsub(".*-", "", ISIC)))) %>%
    dplyr::left_join(., readxl::read_excel("./ancillary_data/correct_coding_scheme.xlsx") %>%
                       mutate(ISIC_coding_correct = paste0(ISIC_coding_correct, "_group")),
                     by = c("country_short", "subnational_level", "year")) %>%
    filter(ISIC_coding == ISIC_coding_correct) %>%
    #filter(!(estimated != 0 & labour_survey == 1)) %>% # remove "true NAs"; ISIC codes without reported respondents in this subnational unit in this year
    ungroup() %>%
    dplyr::select(country_short, subnational_level, subnational_code, subnational_name, year, ISIC_coding, ISIC_code, count, weight, estimated) %>%
    dplyr::left_join(., sectors, by = c("ISIC_coding", "ISIC_code")) %>% 
    mutate(weight = ifelse(is.na(weight), 0, weight))
  
  #### Merge Labour Data with RCA and Trade Ratio ####
  
  dat_list <- list(
    ISIC3g = subset(dat_fill, ISIC_coding == "ISIC3_group") %>%
      dplyr::left_join(., rca[["ISIC3g"]], by = c("country_short", "ISIC_code", "year")) %>%
      dplyr::left_join(., ISIC_schemes[["ISIC3_names"]], by = c("ISIC_code" = "ISIC3_group")) %>%
      mutate(ISIC_name = as.character(ISIC_name)), 
    ISIC31g = subset(dat_fill, ISIC_coding == "ISIC31_group") %>%
      dplyr::left_join(., rca[["ISIC31g"]], by = c("country_short", "ISIC_code", "year")) %>%
      dplyr::left_join(., ISIC_schemes[["ISIC31_names"]], by = c("ISIC_code" = "ISIC31_group")) %>%
      mutate(ISIC_name = as.character(ISIC_name)), 
    ISIC4g = subset(dat_fill, ISIC_coding == "ISIC4_group") %>%
      dplyr::left_join(., rca[["ISIC4g"]], by = c("country_short", "ISIC_code", "year")) %>%
      dplyr::left_join(., ISIC_schemes[["ISIC4_names"]], by = c("ISIC_code" = "ISIC4_group")) %>%
      mutate(ISIC_name = as.character(ISIC_name))
  )
  
  dat_comp <- dplyr::bind_rows(dat_list) %>%
    mutate(rxa_raw = ifelse(rxa_raw %in% c(-Inf, Inf), NA, rxa_raw),
           rca_sym = ifelse(rca_sym %in% c(-Inf, Inf), NA, rca_sym),
           rca_add = ifelse(rca_add %in% c(-Inf, Inf), NA, rca_add),
           rca_net = ifelse(rca_net %in% c(-Inf, Inf), NA, rca_net),
           rca_tba = ifelse(rca_tba %in% c(-Inf, Inf), NA, rca_tba)) %>%
    dplyr::select(country_short, subnational_level, subnational_code, subnational_name, year, ISIC_coding, ISIC_code, ISIC_name, count, weight, estimated, sector,
                  contains("cntry_"), contains("world_"), contains("ex_"), contains("im_"), rxa_raw, rma_raw, contains("rca_"))
  
  #### Aggregate to Sector Level 
  
  stc_country <- dat_comp  %>%
    group_by(country_short, subnational_level, year, ISIC_coding) %>%
    summarise(stc_sym_trad_nat = questionr::wtd.mean(x = rca_sym[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_add_trad_nat = questionr::wtd.mean(x = rca_add[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_net_trad_nat = questionr::wtd.mean(x = rca_net[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_tba_trad_nat = questionr::wtd.mean(x = rca_tba[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T))
  
  stc_subnational <- dat_comp  %>%
    mutate(count_agri = count * ifelse(sector == "agri", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_mini = count * ifelse(sector == "mini", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_manu = count * ifelse(sector %in% c("manu_lotech", "manu_hitech"), 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_malt = count * ifelse(sector == "manu_lotech", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_maht = count * ifelse(sector == "manu_hitech", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_good = count * ifelse(sector %in% c("agri", "mini", "manu_lotech", "manu_hitech"), 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_sr_n = count * ifelse(sector == "sr_n", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_sr_t = count * ifelse(sector == "sr_t", 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_serv = count * ifelse(sector %in% c("sr_n", "sr_t"), 1, 0) * ifelse(is.na(rca_tba), 0,1),
           count_trad = count * ifelse(sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t"), 1, 0) * ifelse(is.na(rca_tba), 0,1)) %>%
    group_by(country_short, subnational_level, subnational_code, subnational_name, year, ISIC_coding) %>%
    summarise(n_sectors = sum(ifelse(weight == 0, 0,1) * ifelse(is.na(rca_tba), 0,1)),
              estimated = mean(estimated, na.rm = T),
              count = sum(count, na.rm=T),
              count_agri = sum(count_agri, na.rm=T),
              count_mini = sum(count_mini, na.rm=T),
              count_manu = sum(count_manu, na.rm=T),
              count_malt = sum(count_malt, na.rm=T),
              count_maht = sum(count_maht, na.rm=T),
              count_good = sum(count_good, na.rm=T),
              count_sr_n = sum(count_sr_n, na.rm=T),
              count_sr_t = sum(count_sr_t, na.rm=T),
              count_serv = sum(count_serv, na.rm=T),
              count_trad = sum(count_trad, na.rm=T),
              weighted_count = sum(weight, na.rm=T),
              share_agri = questionr::wtd.mean(x = (sector == "agri" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_mini = questionr::wtd.mean(x = (sector == "mini" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_manu = questionr::wtd.mean(x = (sector %in% c("manu_lotech", "manu_hitech") & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_malt = questionr::wtd.mean(x = (sector == "manu_lotech" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_maht = questionr::wtd.mean(x = (sector == "manu_hitech" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_good = questionr::wtd.mean(x = (sector %in% c("agri", "mini", "manu_lotech", "manu_hitech") & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_sr_n = questionr::wtd.mean(x = (sector == "sr_n" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_sr_t = questionr::wtd.mean(x = (sector == "sr_t" & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_serv = questionr::wtd.mean(x = (sector %in% c("sr_n", "sr_t") & !is.na(rca_tba)), weights = weight,  na.rm = T),
              share_trad = questionr::wtd.mean(x = (sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t") & !is.na(rca_tba)), weights = weight,  na.rm = T),
              
              stc_sym_trad_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_add_trad_sub = questionr::wtd.mean(x = rca_add[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_net_trad_sub = questionr::wtd.mean(x = rca_net[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              stc_tba_trad_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu_lotech", "manu_hitech", "sr_t")],  na.rm = T),
              
              stc_sym_agri_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("agri")], weights = weight[sector %in% c("agri")],  na.rm = T),
              stc_add_agri_sub = questionr::wtd.mean(x = rca_add[sector %in% c("agri")], weights = weight[sector %in% c("agri")],  na.rm = T),
              stc_net_agri_sub = questionr::wtd.mean(x = rca_net[sector %in% c("agri")], weights = weight[sector %in% c("agri")],  na.rm = T),
              stc_tba_agri_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("agri")], weights = weight[sector %in% c("agri")],  na.rm = T),
              
              stc_sym_mini_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("mini")], weights = weight[sector %in% c("mini")],  na.rm = T),
              stc_add_mini_sub = questionr::wtd.mean(x = rca_add[sector %in% c("mini")], weights = weight[sector %in% c("mini")],  na.rm = T),
              stc_net_mini_sub = questionr::wtd.mean(x = rca_net[sector %in% c("mini")], weights = weight[sector %in% c("mini")],  na.rm = T),
              stc_tba_mini_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("mini")], weights = weight[sector %in% c("mini")],  na.rm = T),
              
              stc_sym_manu_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("manu_lotech", "manu_hitech")], weights = weight[sector %in% c("manu_lotech", "manu_hitech")],  na.rm = T),
              stc_add_manu_sub = questionr::wtd.mean(x = rca_add[sector %in% c("manu_lotech", "manu_hitech")], weights = weight[sector %in% c("manu_lotech", "manu_hitech")],  na.rm = T),
              stc_net_manu_sub = questionr::wtd.mean(x = rca_net[sector %in% c("manu_lotech", "manu_hitech")], weights = weight[sector %in% c("manu_lotech", "manu_hitech")],  na.rm = T),
              stc_tba_manu_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("manu_lotech", "manu_hitech")], weights = weight[sector %in% c("manu_lotech", "manu_hitech")],  na.rm = T),
              
              stc_sym_malt_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("manu_lotech")], weights = weight[sector %in% c("manu_lotech")],  na.rm = T),
              stc_add_malt_sub = questionr::wtd.mean(x = rca_add[sector %in% c("manu_lotech")], weights = weight[sector %in% c("manu_lotech")],  na.rm = T),
              stc_net_malt_sub = questionr::wtd.mean(x = rca_net[sector %in% c("manu_lotech")], weights = weight[sector %in% c("manu_lotech")],  na.rm = T),
              stc_tba_malt_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("manu_lotech")], weights = weight[sector %in% c("manu_lotech")],  na.rm = T),
              
              stc_sym_maht_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("manu_hitech")], weights = weight[sector %in% c("manu_hitech")],  na.rm = T),
              stc_add_maht_sub = questionr::wtd.mean(x = rca_add[sector %in% c("manu_hitech")], weights = weight[sector %in% c("manu_hitech")],  na.rm = T),
              stc_net_maht_sub = questionr::wtd.mean(x = rca_net[sector %in% c("manu_hitech")], weights = weight[sector %in% c("manu_hitech")],  na.rm = T),
              stc_tba_maht_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("manu_hitech")], weights = weight[sector %in% c("manu_hitech")],  na.rm = T),
              
              stc_sym_sr_t_sub = questionr::wtd.mean(x = rca_sym[sector %in% c("sr_t")], weights = weight[sector %in% c("sr_t")],  na.rm = T),
              stc_add_sr_t_sub = questionr::wtd.mean(x = rca_add[sector %in% c("sr_t")], weights = weight[sector %in% c("sr_t")],  na.rm = T),
              stc_net_sr_t_sub = questionr::wtd.mean(x = rca_net[sector %in% c("sr_t")], weights = weight[sector %in% c("sr_t")],  na.rm = T),
              stc_tba_sr_t_sub = questionr::wtd.mean(x = rca_tba[sector %in% c("sr_t")], weights = weight[sector %in% c("sr_t")],  na.rm = T))
  
  stc_combined <- dplyr::left_join(stc_subnational, stc_country) %>% # subtract national stc from subnational stc
    mutate(stc_sym = stc_sym_trad_sub - stc_sym_trad_nat, 
           stc_add = stc_add_trad_sub - stc_add_trad_nat,
           stc_net = stc_net_trad_sub - stc_net_trad_nat,
           stc_tba = stc_tba_trad_sub - stc_tba_trad_nat,
           
           stc_sym_agri = stc_sym_agri_sub - stc_sym_trad_nat,
           stc_add_agri = stc_add_agri_sub - stc_add_trad_nat,
           stc_net_agri = stc_net_agri_sub - stc_net_trad_nat,
           stc_tba_agri = stc_tba_agri_sub - stc_tba_trad_nat,
           
           stc_sym_mini = stc_sym_mini_sub - stc_sym_trad_nat,
           stc_add_mini = stc_add_mini_sub - stc_add_trad_nat, 
           stc_net_mini = stc_net_mini_sub - stc_net_trad_nat,
           stc_tba_mini = stc_tba_mini_sub - stc_tba_trad_nat,
           
           stc_sym_manu = stc_sym_manu_sub - stc_sym_trad_nat,
           stc_add_manu = stc_add_manu_sub - stc_add_trad_nat,
           stc_net_manu = stc_net_manu_sub - stc_net_trad_nat,
           stc_tba_manu = stc_tba_manu_sub - stc_tba_trad_nat,
           
           stc_sym_malt = stc_sym_malt_sub - stc_sym_trad_nat,
           stc_add_malt = stc_add_malt_sub - stc_add_trad_nat,
           stc_net_malt = stc_net_malt_sub - stc_net_trad_nat,
           stc_tba_malt = stc_tba_malt_sub - stc_tba_trad_nat,
           
           stc_sym_maht = stc_sym_maht_sub - stc_sym_trad_nat,
           stc_add_maht = stc_add_maht_sub - stc_add_trad_nat,
           stc_net_maht = stc_net_maht_sub - stc_net_trad_nat,
           stc_tba_maht = stc_tba_maht_sub - stc_tba_trad_nat,
           
           stc_sym_sr_t = stc_sym_sr_t_sub - stc_sym_trad_nat,
           stc_add_sr_t = stc_add_sr_t_sub - stc_add_trad_nat,
           stc_net_sr_t = stc_net_sr_t_sub - stc_net_trad_nat,
           stc_tba_sr_t = stc_tba_sr_t_sub - stc_tba_trad_nat)
  
  # Calculate distance to next real observation
  dat_dist <- data.frame(country_short = character(),
                         year = numeric(),
                         distance = numeric())
  
  for (j in unique(stc_combined$subnational_code)){
    
    temp_dat <- stc_combined %>% filter(subnational_code == j) %>% ungroup() %>% dplyr::select(subnational_code, year, estimated)
    
    na <- which(temp_dat$estimated != 0)
    non_na <- which(temp_dat$estimated < 1)
    temp_dat$distance <- 0
    
    temp_dat$distance[na] <- abs(na - non_na[findInterval(na, (non_na[-length(non_na)] + non_na[-1]) / 2) + 1])
    
    dat_dist <- rbind(dat_dist, temp_dat %>% dplyr::select(-estimated))
    
  }
  
  stc_combined <- dplyr::left_join(stc_combined, dat_dist, by = c("subnational_code", "year"))
  
  #### Clean ####

  stc_combined <- stc_combined %>%
    dplyr::select(-contains("_sub"), -contains("_nat")) %>%
    dplyr::select(country_short:estimated, distance, count:stc_tba_sr_t) %>%
    ungroup() %>% 
    mutate_at(.tbl = ., .vars = vars(contains("stc_")), .funs = function(x){ifelse(.$count_trad < 50, NA, x)})
  
  stc <- rbind(stc, stc_combined)
  
  stc_raw <- rbind(stc_raw, dat_comp)
  
  print(paste0("Concluded: ",i,"/",length(inputFiles), "; ", tempCountry, " ", tempLevel))
  
}

stc <- dplyr::left_join(stc, stc %>% 
                           select(country_short, subnational_level, subnational_code) %>%
                           distinct() %>%
                           group_by(country_short, subnational_level) %>%
                           summarise(count = n()) %>%
                           ungroup() %>%
                           arrange(country_short, count) %>%
                           group_by(country_short) %>%
                           mutate(aggregation_level = 1:n()) %>%
                           select(-count),
                         by = c("country_short", "subnational_level")) %>%
  select(country_short, subnational_level, subnational_code, subnational_name, aggregation_level, everything())

stc_raw <- dplyr::left_join(stc_raw, stc %>% 
                              select(country_short, subnational_level, subnational_code) %>%
                              distinct() %>%
                              group_by(country_short, subnational_level) %>%
                              summarise(count = n()) %>%
                              ungroup() %>%
                              arrange(country_short, count) %>%
                              group_by(country_short) %>%
                              mutate(aggregation_level = 1:n()) %>%
                              select(-count),
                            by = c("country_short", "subnational_level")) %>%
  select(country_short, subnational_level, subnational_code, subnational_name, aggregation_level, everything())

saveRDS(stc, file = "stc.rds")
write.csv(stc, "stc.csv", row.names = F)

saveRDS(stc_raw, file = "not_shareable/stc_raw.rds")
